change workdir

Load required libraries

## 
## ==> 2024-04-24 21:02:41 loading required packages ...

Load functions

Input files

## set working directory
outdir <- file.path(workdir, "output", fsep = "/")
gff_file <- file.path(workdir, "../genomes/GCA_000014265.1_ASM1426v1_genomic.gff", fsep = "/")
count_file=paste(workdir, "Tricho_featureCounts_result.tsv", sep="/")
metadata_file=paste(workdir, "metadata.tsv", sep="/")
if (workdir != getwd()) system(paste("mkdir -p", workdir, sep=" "))
setwd(workdir)
if (!dir.exists(outdir)) {
    dir.create(outdir)
} else {
    cat("\n==>", as.character(Sys.time()),  "Output directory ", outdir, "exists, will use it.\n")
}
## 
## ==> 2024-04-24 21:02:50 Output directory  /Users/shengwei/GitHub/Tricho-Transcriptomics/differential-expression-analysis/output exists, will use it.
prefix ="Tricho"
prefix <- paste(outdir, prefix, sep="/")

GFF annotation file preprocessing

# read in gff file
gff <- gffReader(gff_file)
## 
## ==> 2024-04-24 21:02:50 reading gff file  /Users/shengwei/GitHub/Tricho-Transcriptomics/differential-expression-analysis/../genomes/GCA_000014265.1_ASM1426v1_genomic.gff 
## 
## ==> 2024-04-24 21:02:50 class of gff is  data.frame 
## 
## ==> 2024-04-24 21:02:50  found 9672 rows with classes: character, character, character, integer, integer, character, character, character, character 
## 
## ==> 2024-04-24 21:02:50  found 9672 rows after filtration!
#View(gff)

# get gene_gff
gene_gff <- gff[gff$feature =="gene", ]
#fix(gene_gff)
#View(gene_gff)

# only keep features that are not "gene", 
otherFeatures <- unique(gff[, 3])
#otherFeatures
otherFeatures <- otherFeatures[-which(otherFeatures %in% c("gene", "region", "exon"))]
#otherFeatures

# get other_gff
other_gff <- data.frame(seqname=character(0), source=character(0), feature=character(0), 
                        start=integer(0), end=integer(0), score=character(0), 
                        strand=character(0), frame=character(0), atrributes=character(0))
for (i in 1:length(otherFeatures)){
  #print(otherFeatures[i])
  other_gff <- rbind(other_gff, gff[gff$feature == otherFeatures[i], ])
}
#fix(other_gff)

# bind geneAttributesColumn and geneID to other_gff
other_gff <- addGeneAttributesColumn(other_gff, gene_gff)
locus_tag <- getAttributeField(other_gff[, "geneAttributes"], "locus_tag", attrsep = ";")
other_gff <- cbind(other_gff, locus_tag=locus_tag)
#head(other_gff)
#View(other_gff)

# extract product, make annotations
product <- getAttributeField(other_gff[, "attributes"], "product", attrsep = ";")
dbxref <- getAttributeField(other_gff[, "attributes"], "Dbxref", attrsep = ";")
note <- getAttributeField(other_gff[, "attributes"], "Note", attrsep = ";")
annotations <- other_gff[, c("geneID", "locus_tag", "seqname", "feature", "start", "end", "strand", "attributes")]
annotations <- cbind(annotations, product, dbxref, note)
# reorganize the order of product column
annotations <- annotations[, c("geneID", "locus_tag", "feature", "start", "end", "strand", "seqname", "product", "dbxref", "note", "attributes")]
#View(annotations)
#head(annotations)
cat("\n==>", as.character(Sys.time()), "final annotations look like: \n")
## 
## ==> 2024-04-24 21:02:52 final annotations look like:
head(annotations, 2)
##   geneID locus_tag feature start  end strand    seqname
## 3  gene0 Tery_0001     CDS    27 1397      + CP000393.1
## 5  gene1 Tery_0002     CDS  1489 1689      - CP000393.1
##                                                                product
## 3                       chromosomal replication initiator protein DnaA
## 5 putative transposase gene of IS630 family insertion sequence ISY100h
##                                                                                           dbxref
## 3 InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR013159,InterPro:IPR013317,NCBI_GP:ABG49531.1
## 5                                                                             NCBI_GP:ABG49532.1
##                                                                                                                                                                                                                               note
## 3 KEGG: ana:alr2009 chromosomal replication initiator protein~TIGRFAM: chromosomal replication initiator protein DnaA~PFAM: Chromosomal replication initiator%2C DnaA-like Chromosomal replication initiator%2C DnaA~SMART: ATPase
## 5                                                                                                                                           KEGG: syn:sll0431 putative transposase gene of IS630 family insertion sequence ISY100h
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                attributes
## 3 ID=cds0;Parent=gene0;Dbxref=InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR013159,InterPro:IPR013317,NCBI_GP:ABG49531.1;Name=ABG49531.1;Note=KEGG: ana:alr2009 chromosomal replication initiator protein~TIGRFAM: chromosomal replication initiator protein DnaA~PFAM: Chromosomal replication initiator%2C DnaA-like Chromosomal replication initiator%2C DnaA~SMART: ATPase;gbkey=CDS;product=chromosomal replication initiator protein DnaA;protein_id=ABG49531.1;transl_table=11
## 5                                                                                                                                                                                                 ID=cds1;Parent=gene1;Dbxref=NCBI_GP:ABG49532.1;Name=ABG49532.1;Note=KEGG: syn:sll0431 putative transposase gene of IS630 family insertion sequence ISY100h;gbkey=CDS;product=putative transposase gene of IS630 family insertion sequence ISY100h;protein_id=ABG49532.1;transl_table=11

Count table preprocessing

rawCounts <- read.table(file = count_file, header = T, sep = "\t", as.is = T, stringsAsFactors = F)
#fix(rawCounts)
#head(rawCounts)
#ncol(rawCounts)

# get counts table
rawTable <- rawCounts[, 7:ncol(rawCounts)]
rownames(rawTable) <- rawCounts[, 1]
#head(rawTable)
counts <- sapply(rawTable, as.numeric)
rownames(counts) <- rownames(rawTable)
cat("\n==>", as.character(Sys.time()), " raw count table looks like: \n")
## 
## ==> 2024-04-24 21:02:52  raw count table looks like:
head(counts, 2)
##           HFe_d5_rep1 HFe_d5_rep2 HFe_d5_rep3 LFe_d5_rep1 LFe_d5_rep2
## Tery_0001         328         518         394         303         537
## Tery_0002          20          35          34          17          31
##           MFe_d5_rep1 MFe_d5_rep2
## Tery_0001         473         262
## Tery_0002          17          18

filter out weakly expressed genes and rRNAs/tRNAs

# calculate count per million
cpms <- cpm(counts)
#head(cpms)
cat("\n==>", as.character(Sys.time()), " Dimension before filtering: ", dim(cpms), "\n")
## 
## ==> 2024-04-24 21:02:52  Dimension before filtering:  4499 7
# keep rows meet requirements: at least 1 column has cpm more than 1, rowSums more than 0.5*ncol
keep <- rowSums(cpms > 1) >=1 & rowSums(cpms) >= ncol(cpms)*0.5
counts <- counts[keep, ]
cat("\n==>", as.character(Sys.time()), " Dimension after filtering weakly expressed genes: ", dim(counts), "\n")
## 
## ==> 2024-04-24 21:02:52  Dimension after filtering weakly expressed genes:  4249 7
# remove rows belongs to rRNA and tRNA
#View(other_gff)
#remove <- other_gff[other_gff$feature == "rRNA" | other_gff$feature == "tRNA", "geneID"]
remove <- other_gff[other_gff$feature == "rRNA", "geneID"]
#remove

counts <- counts[!row.names(counts)%in%remove, ]
#head(counts)
#dim(counts)
cat("\n==>", as.character(Sys.time()), " Dimension after removing weakly expressed genes and rRNA/tRNA: ", dim(counts))
## 
## ==> 2024-04-24 21:02:52  Dimension after removing weakly expressed genes and rRNA/tRNA:  4249 7
#View(counts)

read in metadata

cat("\n==>", as.character(Sys.time()), " making metatable ...\n")
## 
## ==> 2024-04-24 21:02:52  making metatable ...
# build a meta table, giving all experimental factors
#colnames(counts)

# read in metadata
metadata <- read.table(metadata_file, header = TRUE, as.is = TRUE, stringsAsFactors = FALSE, row.names = 1)
if(!all.equal(rownames(metadata), colnames(counts))) {
  stop("sample names in metadata didn't match column names in count table!\n")
}
cat("\n==>", as.character(Sys.time()), " metadata looks like: \n")
## 
## ==> 2024-04-24 21:02:52  metadata looks like:
head(metadata,7)
##             day replicate treatment treatment_day       color pch
## HFe_d5_rep1   5         1       HFe        HFe_d5 forestgreen  16
## HFe_d5_rep2   5         2       HFe        HFe_d5 forestgreen  16
## HFe_d5_rep3   5         3       HFe        HFe_d5 forestgreen  16
## LFe_d5_rep1   5         1       LFe        LFe_d5         red  16
## LFe_d5_rep2   5         2       LFe        LFe_d5         red  16
## MFe_d5_rep1   5         1       MFe        MFe_d5     yellow3  16
## MFe_d5_rep2   5         2       MFe        MFe_d5     yellow3  16

get edgeR TMM normalized CPM and log2CPM counts

cat("\n==>", as.character(Sys.time()), " running edgeR TMM normalization ...\n")
## 
## ==> 2024-04-24 21:02:52  running edgeR TMM normalization ...
# make DGEList
cat("\n==>", as.character(Sys.time()), " making DEGList with counts and metadata ...\n")
## 
## ==> 2024-04-24 21:02:52  making DEGList with counts and metadata ...
d <- DGEList(counts=counts, group=as.factor(metadata$treatment))
#d
cat("\n==>", as.character(Sys.time()), " calculating normalization factors using TMM method ...\n")
## 
## ==> 2024-04-24 21:02:52  calculating normalization factors using TMM method ...
d <- calcNormFactors(d, method = "TMM")
#d
cat("\n==>", as.character(Sys.time()), " calculating count per million (cpm) using normalized DGEList object ...\n")
## 
## ==> 2024-04-24 21:02:52  calculating count per million (cpm) using normalized DGEList object ...
cpms<- cpm(d, normalized.lib.sizes=TRUE)
#head(cpms)
cat("\n==>", as.character(Sys.time()), " formating cpm counts to scientific format with 2 digits ...\n")
## 
## ==> 2024-04-24 21:02:52  formating cpm counts to scientific format with 2 digits ...
cpms.formatted <- format(cpms, digits=2, scientific = F)
colnames(cpms.formatted) <- paste(colnames(cpms.formatted), "_CPM", sep = "")
cat("\n==>", as.character(Sys.time()), " formated cpm looks like this:\n")
## 
## ==> 2024-04-24 21:02:52  formated cpm looks like this:
head(cpms.formatted, 2)
##           HFe_d5_rep1_CPM HFe_d5_rep2_CPM HFe_d5_rep3_CPM LFe_d5_rep1_CPM
## Tery_0001 "    89.95"     "    79.68"     "    72.19"     "    99.83"    
## Tery_0002 "     5.49"     "     5.38"     "     6.23"     "     5.60"    
##           LFe_d5_rep2_CPM MFe_d5_rep1_CPM MFe_d5_rep2_CPM
## Tery_0001 "    97.53"     "    84.73"     "    73.85"    
## Tery_0002 "     5.63"     "     3.05"     "     5.07"
# compute log2CPM
cat("\n==>", as.character(Sys.time()), " computing log2 transformed cpms ...\n")
## 
## ==> 2024-04-24 21:02:52  computing log2 transformed cpms ...
cpms.log <- log2(cpms+0.01)
cpms.log.formatted <- format(cpms.log, digits=2, scientific = F)
colnames(cpms.log.formatted) <- paste(colnames(cpms), "_logCPM", sep = "")
cat("\n==> formated logCPM looks like this:\n")
## 
## ==> formated logCPM looks like this:
head(cpms.log.formatted, 2)
##           HFe_d5_rep1_logCPM HFe_d5_rep2_logCPM HFe_d5_rep3_logCPM
## Tery_0001 " 6.4913"          " 6.3164"          " 6.1740"         
## Tery_0002 " 2.4581"          " 2.4314"          " 2.6415"         
##           LFe_d5_rep1_logCPM LFe_d5_rep2_logCPM MFe_d5_rep1_logCPM
## Tery_0001 " 6.6415"          " 6.6080"          " 6.4050"         
## Tery_0002 " 2.4882"          " 2.4958"          " 1.6114"         
##           MFe_d5_rep2_logCPM
## Tery_0001 " 6.2068"         
## Tery_0002 " 2.3459"
cat("\n==>", as.character(Sys.time()), " writing out cpms ...\n")
## 
## ==> 2024-04-24 21:02:52  writing out cpms ...
# annotate the cpm table, then write it out with annotations
rawCounts.selected <- rawTable[rownames(cpms), ]
#head(rawCounts.selected)
colnames(rawCounts.selected) <- paste(colnames(rawCounts.selected), "_RAW", sep = "")
#head(rawCounts.selected)
rawCount_cpms <- cbind(rawCounts.selected, cpms.formatted, cpms.log.formatted)
#head(rawCount_cpms)
annotations.selected <- annotations[annotations$locus_tag%in%rownames(rawCount_cpms), ]
#View(annotations.selected)
rownames(annotations.selected) <- annotations.selected$locus_tag
anno_cols <- c('locus_tag', 'geneID', 'feature', 'start', 'end', 'strand', 'seqname', 'product', 'note', 'dbxref')
annotations.selected <- annotations.selected[rownames(cpms.formatted), anno_cols]
#head(annotations.selected)
rawCount_cpms.anno <- cbind(rawCount_cpms, annotations.selected)
#head(rawCount_cpms.anno)
write.table(rawCount_cpms, file=paste(prefix, "_featureCounts_CPM.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
write.table(rawCount_cpms.anno, file=paste(prefix, "_featureCounts_CPM_anno.tsv", sep=""), sep="\t", row.names = T, col.names=NA)

#pdf(file = paste(prefix, "_featureCounts_top1000_logCPM.pdf", sep="")) 
pheatmap(cpms.log[1:500, ], cluster_cols = F)

#dev.off()

Differential Expression Analysis

construct DESeqDataSet object

library(DESeq2)
library(ggplot2)

cat("\n==>", as.character(Sys.time()), " importing counts and metadata into DESeq: \n")
## 
## ==> 2024-04-24 21:02:53  importing counts and metadata into DESeq:
dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = metadata,
                              design= ~ treatment)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

check read number per sample

cat("\n==>", as.character(Sys.time()), " total read number per sample are ... \n")
## 
## ==> 2024-04-24 21:02:53  total read number per sample are ...
total_read_no <- as.data.frame(colSums(assay(dds)))
colnames(total_read_no) <- c("TotalReads")
total_read_no <- cbind(total_read_no, metadata[rownames(total_read_no), ])
total_read_no$SampleName <- rownames(total_read_no)
total_read_no
##             TotalReads day replicate treatment treatment_day       color pch
## HFe_d5_rep1    3433852   5         1       HFe        HFe_d5 forestgreen  16
## HFe_d5_rep2    6134662   5         2       HFe        HFe_d5 forestgreen  16
## HFe_d5_rep3    5446224   5         3       HFe        HFe_d5 forestgreen  16
## LFe_d5_rep1    3025047   5         1       LFe        LFe_d5         red  16
## LFe_d5_rep2    5521660   5         2       LFe        LFe_d5         red  16
## MFe_d5_rep1    5754829   5         1       MFe        MFe_d5     yellow3  16
## MFe_d5_rep2    3881938   5         2       MFe        MFe_d5     yellow3  16
##              SampleName
## HFe_d5_rep1 HFe_d5_rep1
## HFe_d5_rep2 HFe_d5_rep2
## HFe_d5_rep3 HFe_d5_rep3
## LFe_d5_rep1 LFe_d5_rep1
## LFe_d5_rep2 LFe_d5_rep2
## MFe_d5_rep1 MFe_d5_rep1
## MFe_d5_rep2 MFe_d5_rep2
my_theme1 <- theme_bw() + 
  theme(panel.background = element_blank(), 
        axis.line = element_line(colour = "black"), 
        text = element_text(size = 16),
        axis.title.x = element_text(size=18, color="black"), 
        axis.title.y = element_text(size=18, color="black"), 
        axis.text.x = element_text(angle = 0, hjust = 1, color="black"), 
        panel.grid.minor.x = element_line(colour = "grey", linewidth=0.2, linetype = 'dashed'), 
        panel.grid.major.x = element_line(colour = "grey", linewidth=0.2),
        panel.grid.minor.y = element_line(colour = "grey", linewidth = 0.2, linetype = 'dashed'),
        panel.grid.major.y = element_line(colour = "grey", linewidth=0.2),
        legend.position = "bottom", 
        legend.text=element_text(size=10), 
        legend.key.size = unit(1,"line"), 
        plot.margin=unit(c(1,1,1,1),"cm")
       ) 

p_reads <- ggplot(total_read_no, aes(x = SampleName, y = TotalReads, fill = factor(color))) + 
  geom_bar(stat="identity", width=0.5) +
  geom_abline(slope=0, intercept=3e6, col="red",lty=2) +
  scale_fill_manual(values = c("forestgreen", "red", "yellow3"), labels = c("HFe", "LFe", "MFe")) + 
  labs(y = "Total Read Number", x = "Sample") + my_theme1 +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size=10), axis.text.y = element_text(size=10))
ggsave(filename=paste(prefix, "_sample_read_number_barplot.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
p_reads

variance stablizing transformation and regularized logarithm (rlog) transformation

cat("\n==>", as.character(Sys.time()), " running vst transformation ... \n")
## 
## ==> 2024-04-24 21:02:54  running vst transformation ...
vsd <- vst(dds, blind=FALSE)
vst_count <- assay(vsd)
cat("\n==>", as.character(Sys.time()), " running rlog transformation ... \n")
## 
## ==> 2024-04-24 21:02:55  running rlog transformation ...
rld <- rlog(dds, blind=FALSE)
rlog_count <- assay(rld)
cat("\n==>", as.character(Sys.time()), " vst transformed counts look like: \n")
## 
## ==> 2024-04-24 21:02:56  vst transformed counts look like:
head(vst_count, 3)
##           HFe_d5_rep1 HFe_d5_rep2 HFe_d5_rep3 LFe_d5_rep1 LFe_d5_rep2
## Tery_0001    8.797390    8.641845    8.508486    8.936019    8.905984
## Tery_0002    5.781632    5.770847    5.877970    5.796746    5.801250
## Tery_0006    5.781632    6.014847    6.448433    5.839421    5.533193
##           MFe_d5_rep1 MFe_d5_rep2
## Tery_0001    8.724222    8.548865
## Tery_0002    5.398637    5.731818
## Tery_0006    5.398637    6.585203
cat("\n==>", as.character(Sys.time()), " rlog transformed counts look like: \n")
## 
## ==> 2024-04-24 21:02:56  rlog transformed counts look like:
head(rlog_count, 3)
##           HFe_d5_rep1 HFe_d5_rep2 HFe_d5_rep3 LFe_d5_rep1 LFe_d5_rep2
## Tery_0001    8.649126    8.542002    8.450973    8.744031    8.725327
## Tery_0002    4.584050    4.579917    4.652827    4.591704    4.600004
## Tery_0006    4.864530    5.017129    5.334146    4.903892    4.692005
##           MFe_d5_rep1 MFe_d5_rep2
## Tery_0001    8.598987    8.480152
## Tery_0002    4.340565    4.553361
## Tery_0006    4.609619    5.410122
# write vst and rlog counts
write.table(vst_count, file=paste(prefix, "_vst_count.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
write.table(rlog_count, file=paste(prefix, "_rlog_count.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
# boxplot of vst and rlog transformed counts

# plot normalized count
logCount <- as.data.frame(log10(assay(dds)+1)) %>% rownames_to_column %>% 
      gather(key="SampleName", value="Counts", -rowname)
p_logCount <- ggplot(logCount, aes(x=SampleName, y=Counts)) + 
    geom_boxplot(outlier.colour="black", outlier.shape=1, outlier.size=1) + 
    labs(y = "log counts", x = "Sample") + theme(axis.text.x = element_text(angle = 60, hjust = 1, size=10))
vst_count.long <- as.data.frame(vst_count) %>%  rownames_to_column %>% 
      gather(key="SampleName", value="Counts", -rowname)
p_vst <- ggplot(vst_count.long, aes(x=SampleName, y=Counts)) + 
    geom_boxplot(outlier.colour="black", outlier.shape=1, outlier.size=1) + 
    labs(y = "vst counts", x = "Sample")  + theme(axis.text.x = element_text(angle = 60, hjust = 1, size=10))
rlog_count.long <- as.data.frame(rlog_count) %>%  rownames_to_column %>% 
      gather(key="SampleName", value="Counts", -rowname)
p_rlog <- ggplot(rlog_count.long, aes(x=SampleName, y=Counts)) + 
    geom_boxplot(outlier.colour="black", outlier.shape=1, outlier.size=1) + 
    labs(y = "rlog counts", x = "Sample")  + theme(axis.text.x = element_text(angle = 60, hjust = 1, size=10))

p_counts <- p_logCount + p_vst + p_rlog
ggsave(filename=paste(prefix, "_sample_vst_rlog_norm_boxplot.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
p_counts

Exploratory Analysis

PCA using DESeq2

# use ggrepel to label the dots
library(ggrepel)

# PCA based on vsd
pcaData <- plotPCA(vsd, intgroup=c("treatment", "day"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p_vst_pca <- ggplot(pcaData, aes(PC1, PC2)) +
  geom_point(aes(color=treatment, shape=as.character(metadata$pch)), size=3) +
  coord_fixed() +
  ggtitle("PCA based on VST counts") + 
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  geom_text_repel(aes(label=rownames(metadata)), size=3) +
  #geom_text(aes(label=rownames(metadata), hjust=0.5, vjust=-0.4)) + 
  scale_color_manual(values=unique(as.character(metadata$color)[order(metadata$treatment)])) + 
  scale_shape_manual(values=unique(as.numeric(metadata$pch)[order(metadata$pch)])) +
  theme_bw() +
  theme(legend.position="bottom")

# PCA based on rlog
pcaData <- plotPCA(rld, intgroup=c("treatment", "day"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p_rlog_pca <- ggplot(pcaData, aes(PC1, PC2)) +
  geom_point(aes(color=treatment, shape=as.character(metadata$pch)), size=3) +
  coord_fixed() +
  ggtitle("PCA based on rlog counts") + 
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  geom_text_repel(aes(label=rownames(metadata)), size=3) +
  #geom_text(aes(label=rownames(metadata), hjust=0.5, vjust=-0.4)) +
  scale_color_manual(values=unique(as.character(metadata$color)[order(metadata$treatment)])) + 
  scale_shape_manual(values=unique(as.numeric(metadata$pch)[order(metadata$pch)])) +  
  theme_bw() +
  theme(legend.position="bottom")

p_pca <- p_vst_pca + p_rlog_pca
ggsave(filename=paste(prefix, "_sample_vst_rlog_PCA.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
p_pca

sample dendrogram based on vst counts

# clustering using Euclidean distance based on vst counts
vst_dist <- dist(t(vst_count))
# the agglomeration method can be "ward.D", "ward.D2", "single", "complete", 
# "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC)
vst_clust <- hclust(vst_dist, method="ward.D2")
vst_dend <- as.dendrogram(vst_clust, hang=0.1)
vst_dend_cols <- as.character(metadata$color[order.dendrogram(vst_dend)])
labels_colors(vst_dend) <- vst_dend_cols

# save to pdf
#pdf(file = paste(prefix, "_sample_vst_dendrogram.pdf", sep="")) 
plot(vst_dend, ylab="Euclidean Distance based on VST counts")

#dev.off()

sample dendrogram based on rlog counts

# clustering using Euclidean distance based on rlog counts
rlog_dist <- dist(t(rlog_count))
rlog_clust <- hclust(rlog_dist, method="ward.D2")
rlog_dend <- as.dendrogram(rlog_clust, hang=0.1)
rlog_dend_cols <- as.character(metadata$color[order.dendrogram(rlog_dend)])
labels_colors(rlog_dend) <- rlog_dend_cols

#pdf(file = paste(prefix, "_sample_rlog_dendrogram.pdf", sep="")) 
plot(rlog_dend, ylab="Euclidean Distance based on rlog counts")

#dev.off()

PCoA using PhyloSeq with vst counts

# making our phyloseq object with transformed table from vst count
vst_count_phy <- otu_table(vst_count, taxa_are_rows=T)
metadata_phy <- sample_data(metadata)
vst_phyloseq <- phyloseq(vst_count_phy, metadata_phy)
vst_pcoa <- ordinate(vst_phyloseq, method="MDS", distance="euclidean")
# eigen_vals allows us to scale the axes according to their magnitude of separating apart the samples
eigen_vals <- vst_pcoa$values$Eigenvalues 
p_vst_pcoa <- plot_ordination(vst_phyloseq, vst_pcoa) + 
  geom_point(aes(color=treatment, shape=as.character(metadata$pch)), size=3) + 
  labs(col="treatment") +
  geom_text_repel(aes(label=rownames(metadata)), size=3) +
  coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle("PCoA based on VST counts") + 
  scale_color_manual(values=unique(as.character(metadata$color)[order(metadata$treatment)])) + 
  scale_shape_manual(values=unique(as.numeric(metadata$pch)[order(metadata$pch)])) +  
  theme_bw() +
  theme(legend.position="bottom")
ggsave(filename=paste(prefix, "_sample_vst_PCoA.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )

# making our phyloseq object with transformed table from rlog count
rlog_count_phy <- otu_table(rlog_count, taxa_are_rows=T)
metadata_phy <- sample_data(metadata)
rlog_phyloseq <- phyloseq(rlog_count_phy, metadata_phy)
rlog_pcoa <- ordinate(rlog_phyloseq, method="MDS", distance="euclidean")
eigen_vals <- rlog_pcoa$values$Eigenvalues
p_rlog_pcoa <- plot_ordination(rlog_phyloseq, rlog_pcoa) + 
  geom_point(aes(color=treatment, shape=as.character(metadata$pch)), size=3) + 
  labs(col="treatment") +
  geom_text_repel(aes(label=rownames(metadata)), size=3) +
  coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle("PCoA based on rlog counts") + 
  scale_color_manual(values=unique(as.character(metadata$color)[order(metadata$treatment)])) + 
  scale_shape_manual(values=unique(as.numeric(metadata$pch)[order(metadata$pch)])) +  
  theme_bw() +
  theme(legend.position="bottom")

p_pcoa <- p_vst_pcoa + p_rlog_pcoa
ggsave(filename=paste(prefix, "_sample_vst_rlog_PCoA.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
p_pcoa

DE based on treatment

# function to write DE tables
write_DE_tables <- function(contrast, suffix_str){
  contrast <- contrast[order(contrast$padj, decreasing=F),]
  anno <- rawCount_cpms.anno[rownames(contrast), ]
  contrast.anno <- cbind(contrast, anno)
  write.table(contrast, file=paste(prefix, suffix_str, ".tsv", sep=""), sep="\t", row.names = T, col.names=NA)
  write.table(contrast.anno, file=paste(prefix, suffix_str, "_anno.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
  write.table(contrast.anno[contrast$log2FoldChange>0, ], file=paste(prefix, suffix_str, "_anno_up.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
  write.table(contrast.anno[contrast$log2FoldChange<=0, ], file=paste(prefix, suffix_str, "_anno_dn.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
}

# construct DESeqDataSet object
dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = metadata,
                              design= ~ treatment)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# set the baseline treatment as the "LFe" treatment
dds$treatment <- relevel(dds$treatment, ref = "LFe")

# run deseq standard analysis
design(dds) <- ~treatment
dds.treatment <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# extract size factors
size_facctors <- sizeFactors(dds.treatment)
size_facctors
## HFe_d5_rep1 HFe_d5_rep2 HFe_d5_rep3 LFe_d5_rep1 LFe_d5_rep2 MFe_d5_rep1 
##   0.8004557   1.4216113   1.1968956   0.6665582   1.2081094   1.2196937 
## MFe_d5_rep2 
##   0.7717292
# Dispersion plot 
head(dispersions(dds.treatment))
## [1] 0.02286648 0.06576208 0.12845667 0.02085939 0.03464709 0.03714544
plotDispEsts(dds.treatment, cex=0.5, ymin=1e-05)

# get per level baseMean
baseMeanPerLvl <- sapply(levels(dds$treatment), 
                         function(lvl) rowMeans(counts(dds.treatment,normalized=TRUE)[,dds$treatment == lvl]))
colnames(baseMeanPerLvl) <- sapply(levels(dds$treatment), function(lvl) paste0("baseMean_", lvl))
head(baseMeanPerLvl)
##           baseMean_LFe baseMean_HFe baseMean_MFe
## Tery_0001    449.53505    367.77561    363.64980
## Tery_0002     25.58204     26.00418     18.63108
## Tery_0006     22.19346     37.96431     38.71585
## Tery_0008    518.28818    509.83442    511.90678
## Tery_0011    184.42879    222.08368    235.84973
## Tery_0012    365.20928    373.97958    356.84321
# ------------------
# HFe vs LFe samples
# ------------------
HFe_vs_LFe_contrast <- results(dds.treatment, alpha=0.05, contrast=c("treatment", "HFe", "LFe"))
head(HFe_vs_LFe_contrast)
## log2 fold change (MLE): treatment HFe vs LFe 
## Wald test p-value: treatment HFe vs LFe 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##           <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## Tery_0001  389.9566     -0.2915776  0.209951 -1.3887889  0.164897  0.517665
## Tery_0002   23.7770      0.0243618  0.428993  0.0567885  0.954714        NA
## Tery_0006   33.6731      0.8111985  0.541987  1.4967132  0.134468  0.466698
## Tery_0008  512.8419     -0.0213121  0.199397 -0.1068829  0.914882  0.978458
## Tery_0011  215.2583      0.2596615  0.263532  0.9853132  0.324470  0.707095
## Tery_0012  366.5777      0.0405920  0.263543  0.1540241  0.877591  0.967661
# combine res with baseMeanPerLvl
HFe_vs_LFe_contrast.lvl <- cbind(HFe_vs_LFe_contrast['baseMean'], 
                                 baseMeanPerLvl, 
                                 HFe_vs_LFe_contrast[, 2:ncol(HFe_vs_LFe_contrast)])
# check the results
head(HFe_vs_LFe_contrast.lvl)
## DataFrame with 6 rows and 9 columns
##            baseMean baseMean_LFe baseMean_HFe baseMean_MFe log2FoldChange
##           <numeric>    <numeric>    <numeric>    <numeric>      <numeric>
## Tery_0001  389.9566     449.5351     367.7756     363.6498     -0.2915776
## Tery_0002   23.7770      25.5820      26.0042      18.6311      0.0243618
## Tery_0006   33.6731      22.1935      37.9643      38.7158      0.8111985
## Tery_0008  512.8419     518.2882     509.8344     511.9068     -0.0213121
## Tery_0011  215.2583     184.4288     222.0837     235.8497      0.2596615
## Tery_0012  366.5777     365.2093     373.9796     356.8432      0.0405920
##               lfcSE       stat    pvalue      padj
##           <numeric>  <numeric> <numeric> <numeric>
## Tery_0001  0.209951 -1.3887889  0.164897  0.517665
## Tery_0002  0.428993  0.0567885  0.954714        NA
## Tery_0006  0.541987  1.4967132  0.134468  0.466698
## Tery_0008  0.199397 -0.1068829  0.914882  0.978458
## Tery_0011  0.263532  0.9853132  0.324470  0.707095
## Tery_0012  0.263543  0.1540241  0.877591  0.967661
# write 
write_DE_tables(HFe_vs_LFe_contrast, "_HFe_vs_LFe_contrast")
write_DE_tables(HFe_vs_LFe_contrast.lvl, "_HFe_vs_LFe_contrast_lvl")

# plot MA
par(mar = c(4, 4, .1, .1))
plotMA(HFe_vs_LFe_contrast, ylim=c(-2,2), main="HFe_vs_LFe", colSig = "blue", cex=.8)

Filtering criteria The goal of independent filtering is to filter out those tests from the procedure that have no, or little chance of showing significant evidence, without even looking at their test statistic.

Genes with very low counts are not likely to see significant differences typically due to high dispersion. For example, we can plot the −log10(p-values) from all genes over the normalized mean counts.

plot(HFe_vs_LFe_contrast$baseMean+1, -log10(HFe_vs_LFe_contrast$pvalue), main="HFe_vs_LFe",
     log="x", xlab="mean of normalized counts",
     ylab=expression(-log[10](pvalue)),
     ylim=c(0,30),
     cex=.4, col=rgb(0,0,0,.3))

# p-value histogram
# the p value histogram below shows how the filtering ameliorates the multiple testing problem,  
# by removing a background set of hypotheses whose p values are distributed more or less uniformly in [0,1].
cat("\n==>", as.character(Sys.time()), "Filtering criteria:", metadata(HFe_vs_LFe_contrast)$filterThreshold,  "\n")
## 
## ==> 2024-04-24 21:03:03 Filtering criteria: 33.37568
use <- HFe_vs_LFe_contrast$baseMean > metadata(HFe_vs_LFe_contrast)$filterThreshold
h1 <- hist(HFe_vs_LFe_contrast$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(HFe_vs_LFe_contrast$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")

# Histogram of p values for all tests. 
# The area shaded in blue indicates the subset of those that pass the filtering, the area in khaki those that do not pass
{
  barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "HFe_vs_LFe", ylab="frequency") 
  text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA) 
  legend("topright", fill=rev(colori), legend=rev(names(colori)))
}

# ------------------
# MFe vs LFe samples
# ------------------
MFe_vs_LFe_contrast <- results(dds.treatment, alpha=0.05, contrast=c("treatment", "MFe", "LFe"))

# combine res with baseMeanPerLvl
MFe_vs_LFe_contrast.lvl <- cbind(MFe_vs_LFe_contrast['baseMean'], 
                                 baseMeanPerLvl, 
                                 MFe_vs_LFe_contrast[, 2:ncol(MFe_vs_LFe_contrast)])
# check the results
head(MFe_vs_LFe_contrast.lvl)
## DataFrame with 6 rows and 9 columns
##            baseMean baseMean_LFe baseMean_HFe baseMean_MFe log2FoldChange
##           <numeric>    <numeric>    <numeric>    <numeric>      <numeric>
## Tery_0001  389.9566     449.5351     367.7756     363.6498     -0.3029813
## Tery_0002   23.7770      25.5820      26.0042      18.6311     -0.4968857
## Tery_0006   33.6731      22.1935      37.9643      38.7158      0.7918668
## Tery_0008  512.8419     518.2882     509.8344     511.9068     -0.0150832
## Tery_0011  215.2583     184.4288     222.0837     235.8497      0.3572647
## Tery_0012  366.5777     365.2093     373.9796     356.8432     -0.0213793
##               lfcSE       stat    pvalue      padj
##           <numeric>  <numeric> <numeric> <numeric>
## Tery_0001  0.230806 -1.3127124  0.189280  0.497956
## Tery_0002  0.491788 -1.0103657  0.312320        NA
## Tery_0006  0.592055  1.3374879  0.181063        NA
## Tery_0008  0.218807 -0.0689337  0.945042  0.982363
## Tery_0011  0.288434  1.2386348  0.215481  0.530029
## Tery_0012  0.289283 -0.0739045  0.941086  0.981672
# write 
write_DE_tables(MFe_vs_LFe_contrast, "_MFe_vs_LFe_contrast")
write_DE_tables(MFe_vs_LFe_contrast, "_MFe_vs_LFe_contrast_lvl")

# plot MA
par(mar = c(4, 4, .1, .1))
plotMA(MFe_vs_LFe_contrast, ylim=c(-2,2), main="MFe_vs_LFe", colSig = "blue", cex=.8)

plot(MFe_vs_LFe_contrast$baseMean+1, -log10(MFe_vs_LFe_contrast$pvalue), main="MFe_vs_LFe",
     log="x", xlab="mean of normalized counts",
     ylab=expression(-log[10](pvalue)),
     ylim=c(0,30),
     cex=.4, col=rgb(0,0,0,.3))

# p-value histogram
# the p value histogram below shows how the filtering ameliorates the multiple testing problem,  
# by removing a background set of hypotheses whose p values are distributed more or less uniformly in [0,1].
cat("\n==>", as.character(Sys.time()), "Filtering criteria:", metadata(MFe_vs_LFe_contrast)$filterThreshold,  "\n")
## 
## ==> 2024-04-24 21:03:04 Filtering criteria: 75.08633
use <- MFe_vs_LFe_contrast$baseMean > metadata(MFe_vs_LFe_contrast)$filterThreshold
h1 <- hist(MFe_vs_LFe_contrast$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(MFe_vs_LFe_contrast$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")

# Histogram of p values for all tests. 
# The area shaded in blue indicates the subset of those that pass the filtering, the area in khaki those that do not pass
{
  barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "MFe_vs_LFe", ylab="frequency") 
  text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA) 
  legend("topright", fill=rev(colori), legend=rev(names(colori)))
}

# ------------------
# HFe vs MFe samples
# ------------------
HFe_vs_MFe_contrast <- results(dds.treatment, alpha=0.05, contrast=c("treatment", "HFe", "MFe"))

# combine res with baseMeanPerLvl
HFe_vs_MFe_contrast.lvl <- cbind(HFe_vs_MFe_contrast['baseMean'], 
                                 baseMeanPerLvl, 
                                 HFe_vs_MFe_contrast[, 2:ncol(HFe_vs_MFe_contrast)])
# check the results
head(HFe_vs_MFe_contrast.lvl)
## DataFrame with 6 rows and 9 columns
##            baseMean baseMean_LFe baseMean_HFe baseMean_MFe log2FoldChange
##           <numeric>    <numeric>    <numeric>    <numeric>      <numeric>
## Tery_0001  389.9566     449.5351     367.7756     363.6498     0.01140375
## Tery_0002   23.7770      25.5820      26.0042      18.6311     0.52124749
## Tery_0006   33.6731      22.1935      37.9643      38.7158     0.01933172
## Tery_0008  512.8419     518.2882     509.8344     511.9068    -0.00622894
## Tery_0011  215.2583     184.4288     222.0837     235.8497    -0.09760323
## Tery_0012  366.5777     365.2093     373.9796     356.8432     0.06197137
##               lfcSE       stat    pvalue      padj
##           <numeric>  <numeric> <numeric> <numeric>
## Tery_0001  0.210741  0.0541127  0.956845  0.998313
## Tery_0002  0.444496  1.1726719  0.240927  0.846590
## Tery_0006  0.518104  0.0373124  0.970236  0.998313
## Tery_0008  0.198940 -0.0313106  0.975022  0.998313
## Tery_0011  0.260008 -0.3753860  0.707373  0.979413
## Tery_0012  0.263072  0.2355677  0.813768  0.985314
# write
write_DE_tables(HFe_vs_MFe_contrast, "_HFe_vs_MFe_contrast")
write_DE_tables(HFe_vs_MFe_contrast, "_HFe_vs_MFe_contrast_lvl")

# plot MA
par(mar = c(4, 4, .1, .1))
plotMA(HFe_vs_MFe_contrast, ylim=c(-2,2), main="HFe_vs_MFe", colSig = "blue", cex=.8)

plot(HFe_vs_MFe_contrast$baseMean+1, -log10(HFe_vs_MFe_contrast$pvalue), main="HFe_vs_MFe",
     log="x", xlab="mean of normalized counts",
     ylab=expression(-log[10](pvalue)),
     ylim=c(0,30),
     cex=.4, col=rgb(0,0,0,.3))

# p-value histogram
# the p value histogram below shows how the filtering ameliorates the multiple testing problem,  
# by removing a background set of hypotheses whose p values are distributed more or less uniformly in [0,1].
cat("\n==>", as.character(Sys.time()), "Filtering criteria:", metadata(HFe_vs_MFe_contrast)$filterThreshold,  "\n")
## 
## ==> 2024-04-24 21:03:05 Filtering criteria: 4.167553
use <- HFe_vs_MFe_contrast$baseMean > metadata(HFe_vs_MFe_contrast)$filterThreshold
h1 <- hist(HFe_vs_MFe_contrast$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(HFe_vs_MFe_contrast$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")

# Histogram of p values for all tests. 
# The area shaded in blue indicates the subset of those that pass the filtering, the area in khaki those that do not pass
{
  barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "HFe_vs_MFe", ylab="frequency") 
  text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA) 
  legend("topright", fill=rev(colori), legend=rev(names(colori)))
}

summarize DE genes across treatment comparison

# -------
# summary
# -------
# we can get a glimpse at what this table currently holds with the summary command
summary(HFe_vs_LFe_contrast)
## 
## out of 4249 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 181, 4.3%
## LFC < 0 (down)     : 127, 3%
## outliers [1]       : 0, 0%
## low counts [2]     : 577, 14%
## (mean count < 33)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(MFe_vs_LFe_contrast)
## 
## out of 4249 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 156, 3.7%
## LFC < 0 (down)     : 59, 1.4%
## outliers [1]       : 0, 0%
## low counts [2]     : 1071, 25%
## (mean count < 75)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(HFe_vs_MFe_contrast)
## 
## out of 4249 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 33, 0.78%
## LFC < 0 (down)     : 46, 1.1%
## outliers [1]       : 0, 0%
## low counts [2]     : 83, 2%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# make a stat table based on the summary
DE_stats <- "
contrast up  down
HFe_vs_LFe  181 127
HFe_vs_MFe  33  46
MFe_vs_LFe  156 59
"
de_stat_df <- read.table(text=DE_stats, header=T)
de_stat_df
##     contrast  up down
## 1 HFe_vs_LFe 181  127
## 2 HFe_vs_MFe  33   46
## 3 MFe_vs_LFe 156   59
mat <- de_stat_df[, c("up", "down")]
rownames(mat) <- de_stat_df$contrast
# save to pdf
#pdf(file = paste(prefix, "_DESeq2_DE_Overview.pdf", sep="")) 
pheatmap(mat=mat, angle_col = 0, fontsize_col = 10, fontsize_row = 10)

#dev.off()

Euler Diagrame of DE genes across treatment comparison

# get gene names
get_de_gene_names <- function(DESeq_contrast, padj_cutoff=0.01, log2FC_cutoff=1) {
  
    library(tidyverse)
    genes.sig_up <- as.data.frame(DESeq_contrast) %>% 
    rownames_to_column('locus_tag') %>%
    filter(!is.na(padj)) %>% 
    filter(padj <= padj_cutoff, log2FoldChange >= log2FC_cutoff) %>%
    column_to_rownames('locus_tag') %>% rownames()
    
    genes.sig_dn <- as.data.frame(DESeq_contrast) %>% 
      rownames_to_column('locus_tag') %>%
      filter(!is.na(padj)) %>% 
      filter(padj <= padj_cutoff, log2FoldChange <= -1*log2FC_cutoff) %>%
      column_to_rownames('locus_tag') %>% rownames()    
  
    return(list(genes.sig_up, genes.sig_dn))
    
}


padj_cutoff = 0.05
logFC_cutoff = 0.585

# HFe_vs_LFe sig
HFe_vs_LFe_sig <- get_de_gene_names(HFe_vs_LFe_contrast, 
                                    padj_cutoff=padj_cutoff, 
                                    log2FC_cutoff=logFC_cutoff)
HFe_vs_LFe_sig.up <- HFe_vs_LFe_sig[[1]]
HFe_vs_LFe_sig.dn <- HFe_vs_LFe_sig[[2]]

# HFe_vs_MFe sig
HFe_vs_MFe_sig <- get_de_gene_names(HFe_vs_MFe_contrast, 
                                    padj_cutoff=padj_cutoff, 
                                    log2FC_cutoff=logFC_cutoff)
HFe_vs_MFe_sig.up <- HFe_vs_MFe_sig[[1]]
HFe_vs_MFe_sig.dn <- HFe_vs_MFe_sig[[2]]

# MFe_vs_LFe sig
MFe_vs_LFe_sig <- get_de_gene_names(MFe_vs_LFe_contrast, 
                                    padj_cutoff=padj_cutoff, 
                                    log2FC_cutoff=logFC_cutoff)
MFe_vs_LFe_sig.up <- MFe_vs_LFe_sig[[1]]
MFe_vs_LFe_sig.dn <- MFe_vs_LFe_sig[[2]]


#BiocManager::install('eulerr')
library(eulerr)
up.list <- list(
  HFe_vs_LFe = HFe_vs_LFe_sig.up, 
  HFe_vs_MFe = HFe_vs_MFe_sig.up, 
  MFe_vs_LFe = MFe_vs_LFe_sig.up)
dn.list <- list(
  HFe_vs_LFe = HFe_vs_LFe_sig.dn, 
  HFe_vs_MFe = HFe_vs_MFe_sig.dn, 
  MFe_vs_LFe = MFe_vs_LFe_sig.dn)

Euler Diagrame of up-regulated DE genes across treatment comparison

#pdf(file = paste(prefix, "_Eule_diagram_padj0.05_logFC0.585_up.pdf", sep="")) 
plot(euler(up.list, shape = "ellipse"), main="Euler diagram of up regulated genes", 
     fills = c("limegreen", "dodgerblue", "darkgoldenrod1"),
     edges = FALSE,
     fontsize = 8,
     quantities = TRUE, 
     legend=TRUE)

#dev.off()

Euler Diagrame of down-regulated DE genes across treatment comparison

#pdf(file = paste(prefix, "_Eule_diagram_padj0.05_logFC0.585_dn.pdf", sep="")) 
plot(euler(dn.list, shape = "ellipse"), main="Euler diagram of down regulated genes", 
     fills = c("limegreen", "dodgerblue", "darkgoldenrod1"),
     edges = FALSE,
     fontsize = 8,
     quantities = TRUE, 
     legend=TRUE)

#dev.off()

plot the read counts for selected DE genes

# visualizing top gene (based on adj. p-value)
topGene <- rownames(HFe_vs_LFe_contrast)[which.min(HFe_vs_LFe_contrast$padj)]
topGene
## [1] "Tery_2483"
data <- plotCounts(dds.treatment, gene=topGene, intgroup=c("treatment"), returnData = T)
data$treatment <- factor(data$treatment, levels=c("LFe", "MFe", "HFe"))
data$replicate <- factor(colData(dds.treatment)$replicate, levels=c(1, 2, 3))
ggplot(data, aes(x=treatment, y=count, fill=treatment)) +
  scale_y_log10() + theme_bw() +
  geom_violin(alpha=0.3) + 
  geom_jitter(aes(color=treatment), shape=16, position=position_jitter(0.2)) +
  geom_smooth(method = "lm", se = TRUE) +
  ggtitle(topGene)
## `geom_smooth()` using formula = 'y ~ x'

# boxplot of differnet groups
# specify the comparisons you want
my_comparisons <- list(c("HFe", "LFe"), c("HFe", "MFe"), c("MFe", "LFe"))
p_gene <- ggpubr::ggboxplot(data, x = "treatment", y = "count",
          color = "treatment", palette = "jco",
          add = "jitter") + 
     rotate_x_text(angle = 45) +
     geom_hline(yintercept = mean(data$count), linetype=2) + # Add horizontal line at base mean
     stat_compare_means(method="anova", label.y=50) + # Add global p-value
     stat_compare_means(comparisons=my_comparisons, method="t.test", ref.group = "LFe") # Add pairwise comparisons p-value
p_gene

sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    splines   stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] eulerr_7.0.1                ggrepel_0.9.5              
##  [3] ggpubr_0.6.0                lubridate_1.9.3            
##  [5] forcats_1.0.0               stringr_1.5.1              
##  [7] dplyr_1.1.4                 purrr_1.0.2                
##  [9] readr_2.1.5                 tidyr_1.3.1                
## [11] tibble_3.2.1                tidyverse_2.0.0            
## [13] patchwork_1.2.0             pheatmap_1.0.12            
## [15] apeglm_1.18.0               ggplot2_3.5.0              
## [17] phyloseq_1.42.0             dendextend_1.17.1          
## [19] DESeq2_1.36.0               SummarizedExperiment_1.26.1
## [21] MatrixGenerics_1.8.1        matrixStats_1.2.0          
## [23] GenomicRanges_1.48.0        GenomeInfoDb_1.32.4        
## [25] IRanges_2.30.1              S4Vectors_0.34.0           
## [27] NOISeq_2.40.0               Matrix_1.6-5               
## [29] Biobase_2.56.0              BiocGenerics_0.42.0        
## [31] edgeR_3.38.4                limma_3.52.4               
## [33] BiocManager_1.30.22        
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.4.1        systemfonts_1.0.6      plyr_1.8.9            
##   [4] igraph_2.0.3.9008      polylabelr_0.2.0       BiocParallel_1.30.4   
##   [7] digest_0.6.35          foreach_1.5.2          htmltools_0.5.8       
##  [10] viridis_0.6.5          fansi_1.0.6            magrittr_2.0.3        
##  [13] memoise_2.0.1          cluster_2.1.6          tzdb_0.4.0            
##  [16] Biostrings_2.64.1      annotate_1.74.0        bdsmatrix_1.3-7       
##  [19] timechange_0.3.0       colorspace_2.1-0       blob_1.2.4            
##  [22] textshaping_0.3.7      xfun_0.43              crayon_1.5.2          
##  [25] RCurl_1.98-1.14        jsonlite_1.8.8         genefilter_1.78.0     
##  [28] survival_3.5-8         iterators_1.0.14       ape_5.7-1             
##  [31] glue_1.7.0             polyclip_1.10-6        gtable_0.3.4          
##  [34] zlibbioc_1.42.0        XVector_0.36.0         DelayedArray_0.22.0   
##  [37] car_3.1-2              Rhdf5lib_1.18.2        abind_1.4-5           
##  [40] scales_1.3.0           mvtnorm_1.2-4          DBI_1.2.2             
##  [43] rstatix_0.7.2          Rcpp_1.0.12            viridisLite_0.4.2     
##  [46] xtable_1.8-4           emdbook_1.3.13         bit_4.0.5             
##  [49] httr_1.4.7             RColorBrewer_1.1-3     farver_2.1.1          
##  [52] pkgconfig_2.0.3        XML_3.99-0.16.1        sass_0.4.9            
##  [55] locfit_1.5-9.9         utf8_1.2.4             labeling_0.4.3        
##  [58] tidyselect_1.2.1       rlang_1.1.3            reshape2_1.4.4        
##  [61] AnnotationDbi_1.58.0   munsell_0.5.0          tools_4.2.1           
##  [64] cachem_1.0.8           cli_3.6.2              generics_0.1.3        
##  [67] RSQLite_2.3.5          ade4_1.7-22            broom_1.0.5           
##  [70] evaluate_0.23          biomformat_1.24.0      fastmap_1.1.1         
##  [73] yaml_2.3.8             ragg_1.3.0             knitr_1.45            
##  [76] bit64_4.0.5            KEGGREST_1.36.3        nlme_3.1-164          
##  [79] compiler_4.2.1         rstudioapi_0.16.0      png_0.1-8             
##  [82] ggsignif_0.6.4         geneplotter_1.74.0     bslib_0.6.2           
##  [85] stringi_1.8.3          highr_0.10             lattice_0.22-6        
##  [88] ggsci_3.0.3            vegan_2.6-4            permute_0.9-7         
##  [91] multtest_2.52.0        vctrs_0.6.5            pillar_1.9.0          
##  [94] lifecycle_1.0.4        rhdf5filters_1.8.0     jquerylib_0.1.4       
##  [97] data.table_1.15.2      bitops_1.0-7           R6_2.5.1              
## [100] gridExtra_2.3          codetools_0.2-19       MASS_7.3-60.0.1       
## [103] rhdf5_2.40.0           withr_3.0.0            GenomeInfoDbData_1.2.8
## [106] mgcv_1.9-1             parallel_4.2.1         hms_1.1.3             
## [109] grid_4.2.1             coda_0.19-4.1          rmarkdown_2.26        
## [112] carData_3.0-5          bbmle_1.0.25.1         numDeriv_2016.8-1.1